# Tokenize the Chinese wiki corpus.
# Each input line holds a source and a target sentence separated by a tab;
# the output keeps the same pairing, with each sentence segmented into
# space-separated tokens.
import jieba

jieba.enable_paddle()  # enable paddle mode (requires the paddlepaddle-tiny package)


def get_tokenizer(line0, line1):
    """Segment a source/target sentence pair and return one tab-separated output line."""
    sources = jieba.cut(line0, use_paddle=True)
    targets = jieba.cut(line1, use_paddle=True)
    source = " ".join(sources)
    target = " ".join(targets)
    return source + "\t" + target + "\n"
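# Example usage (hypothetical sentences; the exact token boundaries depend on
# the installed jieba/paddle versions):
#   get_tokenizer("今天天气很好", "今天的天气很好")
#   returns the two sentences space-segmented and joined by a tab, e.g.
#   "今天 天气 很好\t今天 的 天气 很好\n"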


def handle(infile, outfile):
    """Tokenize every tab-separated sentence pair in infile and append the results to outfile."""
    with open(infile, "r", encoding="utf-8") as fr, \
            open(outfile, "a+", encoding="utf-8") as f2:
        for i, line in enumerate(fr, start=1):
            if i % 1000 == 0:
                print("processed %d lines" % i)
            line_list = line.strip().split("\t")
            if len(line_list) < 2:  # skip malformed lines without a tab
                continue
            str_result = get_tokenizer(line_list[0], line_list[1])
            f2.write(str_result)


if __name__ == "__main__":
    infile = '/data/corpus_hj/synonymous_validation.txt'
    outfile = '/data/wiki-split-master/synonymous_validation.tsv'
    handle(infile, outfile)
